# -*- coding:UTF-8 -*-
"""
@Project:   DataCrawler
@FileName:  tianyancha.py 
@CreateDate:2023/2/19 17:54  
@Author:    Jia  
@Desc: 在天眼查抓取湖南长沙私募基金公司名称，法人代表，电话
"""
import asyncio
import csv
import os
import sys
import time
from time import sleep

import aiohttp
import pandas as pd
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

from Config import log

logger = log.Logs().debug_logger()


# Layer 1: crawl the search-result pages (27 pages of matching companies).
async def first_floor(url, sem=None):
    """Fetch one search-result page and append company name/URL pairs to CSV.

    :param url: URL of one search-result page (one ``pageNum`` of the query).
    :param sem: optional *shared* asyncio.Semaphore capping concurrency.
                Bug fixed: the original created a new ``Semaphore(5)`` inside
                every coroutine, which limited nothing because each call held
                its own private semaphore. Callers that want a real cap pass
                one shared instance; the default keeps the old behavior.
    """
    if sem is None:
        sem = asyncio.Semaphore(5)
    async with sem:
        async with aiohttp.ClientSession() as session:
            # NOTE(review): ':authority' is an HTTP/2 pseudo-header and not a
            # valid header name for aiohttp's HTTP/1.1 requests, so it was
            # dropped; 'user_agent' was renamed to 'user-agent' so the server
            # actually receives a User-Agent header. The cookie is a captured
            # logged-in session and will expire — refresh it before running.
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64;x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78',
                'referer': 'https://www.tianyancha.com/search?key=私募&sessionNo=1677167401.22011126&base=hun'
                           '&cacheCode=00430100V2020&city=zhangsha&moneyStart=1000&moneyEnd=null&orgType=1',
                'cookie': 'jsid=SEO-BING-ALL-SY-000001; TYCID=9958cb90ad4711ed879cc11b37959821; ssuid=2172353707;'
                          ' _ga=GA1.2.105279380.1676548396; HWWAFSESID=ee99024461a73e630c4; HWWAFSESTIME=1676904982728;'
                          ' csrfToken=uvEEqylmy5bMFV94KbH1zmwW; bdHomeCount=1; Hm_lvt_e92c8d65d92d534b0fc290df538b4758='
                          '1676475855,1676547458,1676904984; sensorsdata2015jssdkcross={"distinct_id":"273629667",'
                          '"first_id":"18655bf70e43e3-0d11c4dd1817098-7d5d547c-1327104-18655bf70e5a5c",'
                          '"props":{"$latest_traffic_source_type":"直接流量","$latest_search_keyword":"未取到值_直接打开",'
                          '"$latest_referrer":""},"identities":"eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg2NTViZjcwZTQzZTMt'
                          'MGQxMWM0ZGQxODE3MDk4LTdkNWQ1NDdjLTEzMjcxMDQtMTg2NTViZjcwZTVhNWMiLCIkaWRlbnRpdHlfbG9naW5fa'
                          'WQiOiIyNzM2Mjk2NjcifQ==",'
                          '"history_login_id":{"name":"$identity_login_id","value":"273629667"},'
                          '"$device_id":"18655bf70e43e3-0d11c4dd1817098-7d5d547c-1327104-18655bf70e5a5c"}; '
                          'bannerFlag=true;'
                          ' RTYCID=5570e9487843452ca97e0fa0a08b5b92; cloud_token=351c319792c34f20bcaeebe62d1df473;'
                          ' tyc-user-info={"state":"4","vipManager":"1","mobile":"18818075170","isExpired":"0"};'
                          ' tyc-user-info-save-time=1677164530378; '
                          'auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxODgxODA3NT'
                          'E3MCIsImlhdCI6MTY3NzE2NDUzMCwiZXhwIjoxNjc5NzU2NTMwfQ.'
                          'Vj2pkV7lMGbqvhyoeY9uLvkIuR23RoZD_PFietdJVRfHCaVZnQ3HixhjkaJAOJiNY5ZcV_3F9b27zGW_8afg4A; '
                          '_gid=GA1.2.758636769.1677165478;'
                          ' searchSessionId=1677167452.53527766; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1677167461'
            }
            # `async with` releases the underlying connection even on error
            # (the original never closed the response).
            async with session.get(url, headers=headers) as response:
                res_text = await response.text()

            # Parse company names and detail-page URLs from the result list.
            html = etree.HTML(res_text)
            company_name = html.xpath('//div[@class="index_name__qEdWi"]/a/span/text()[1]')
            company_url = html.xpath('//div[@class="index_name__qEdWi"]//a[starts-with(@href,"https://")]/@href')

            # Append to CSV, writing the header only when the file is first
            # created: the original used header=True with mode='a', which
            # inserted a spurious header row per page (the literal 'URL' rows
            # the second layer then had to skip).
            out_path = './Data/company_name.csv'
            data = {
                "私募基金公司": company_name,
                "URL": company_url
            }
            df = pd.DataFrame(data)
            df.to_csv(out_path, mode='a', index=False, header=not os.path.exists(out_path))


# Layer 2: open each company page and extract its partners.
async def second_floor(url, undone_url):
    """Scrape partner names and links from one company detail page.

    The page only renders the partner section after client-side JS runs, so
    Selenium drives a real Chrome instead of a plain HTTP request.

    :param url: company detail-page URL (or a stray 'URL' header row).
    :param undone_url: shared list collecting pages that yielded no data.
    :return: 0 when *url* is a header row, otherwise None.

    Bugs fixed: the original created the Chrome driver *before* the
    'URL' guard (leaking a browser per skipped row), left ``driver.get``
    outside the try/finally (a navigation failure leaked the browser too),
    and called ``close()`` — which shuts the window but leaves the
    chromedriver process running — instead of ``quit()``.
    """
    # Stray header rows written by the appending CSV writer look like the
    # literal string 'URL'; skip them before paying browser start-up cost.
    if url == 'URL':
        return 0

    option = webdriver.ChromeOptions()

    # Suppress the "controlled by automated software" infobar on Chrome 76+.
    option.add_experimental_option('useAutomationExtension', False)
    option.add_experimental_option("excludeSwitches", ['enable-automation'])

    # Keep the window open after the script detaches (debug convenience).
    option.add_experimental_option('detach', True)
    driver = webdriver.Chrome(options=option)

    try:
        driver.get(url)
        driver.maximize_window()

        # Crude fixed wait for the JS-rendered content to appear.
        sleep(4)

        company_name = driver.find_element(By.XPATH, '//h1[@class="index_company-name__LqKlo"]').text
        if company_name is not None:
            # Smooth-scroll down to where the partner section lazy-loads.
            js = 'window.scrollTo({left:0, top:2500, behavior: "smooth"});'
            driver.execute_script(js)

            # find_elements returns WebElement objects; pull out the link
            # target and anchor text of every partner entry.
            href_list = []
            partner_list = []
            href = driver.find_elements(By.XPATH, '//a[@class="index_lazy-img-toco__EU_FE link-click"]')
            for h in href:
                href_list.append(h.get_property('href'))
                partner_list.append(h.get_property('text'))

            # Append results to CSV.
            # NOTE(review): path is '../Data' here but layer 1 writes to
            # './Data' — confirm the intended working directory.
            data = {'合伙公司': company_name,
                    '合伙人': partner_list,
                    'URL': href_list}
            p = pd.DataFrame(data)
            p.to_csv('../Data/company_partner.csv', mode='a', index=False, header=True)
            logger.info(f'已完成- {company_name}')
        else:
            undone_url.append(url)
    except Exception as error:
        logger.debug(f'错误信息:{error}')
    finally:
        # quit() also terminates the chromedriver process; close() does not.
        driver.quit()


# Layer 3: fetch each partner page (will yield the partner's phone number).
async def third_floor(url):
    """Fetch one partner detail page and return its raw HTML.

    NOTE(review): parsing is not implemented yet — the original fetched the
    page and silently discarded it. The HTML is now returned so a caller can
    extract the phone number once the XPath is known (backward-compatible:
    previous callers ignored the implicit None return).

    :param url: partner detail-page URL.
    :return: response body as text.
    """
    # TODO: real headers (user-agent/cookie) — an empty dict is likely to be
    # blocked by the site's WAF.
    head = {}
    async with aiohttp.ClientSession() as session:
        # `async with` ensures the connection is released on every path.
        async with session.get(url, headers=head) as response:
            res_text = await response.text()
    return res_text


# Temporary debugging coroutine.
async def test():
    """Log a marker message, then idle for a few seconds."""
    delay = 3
    logger.info('测试一下')
    await asyncio.sleep(delay)


# Entry point for layer 1: crawl search-result pages 21..27 concurrently.
async def main():
    """Build the paged search URLs and run first_floor on each of them."""
    template = ('https://www.tianyancha.com/search?key=私募'
                '&sessionNo=1677167452.53527766&moneyStart=1000'
                '&moneyEnd=null&orgType=1&base=hun&cacheCode=00430100V2020'
                '&city=zhangsha&pageNum={}')
    coroutines = (first_floor(template.format(page)) for page in range(21, 28))
    await asyncio.gather(*coroutines)


# Entry point for layer 2: visit each company page collected by layer 1.
async def main2():
    """Run second_floor over the first 11 company URLs and log timing."""
    started = time.perf_counter()

    # Shared sink for URLs second_floor could not finish.
    failed_urls = []

    # Load the layer-1 output.
    # NOTE(review): path is '../Data' here but first_floor writes './Data' —
    # confirm the intended working directory before running.
    with open('../Data/company_name.csv', 'r', encoding='utf-8') as csvfile:
        urls = [row['URL'] for row in csv.DictReader(csvfile)]

    # Fan out one coroutine per URL (capped to the first 11).
    await asyncio.gather(*(second_floor(u, failed_urls) for u in urls[:11]))

    elapsed = time.perf_counter() - started
    logger.info(f'未完成的URL{failed_urls}')
    logger.info(f'本次程序运行耗时;{elapsed}秒')


# Entry point for layer 3: fetch every partner page collected by layer 2.
async def main3():
    """Read the partner URLs and run third_floor on each.

    Bug fixed: the original built ``url_list`` and then returned without
    using it, so the third layer never actually ran. It now fans out one
    third_floor coroutine per URL, matching the main()/main2() pattern.
    """
    with open('../Data/company_partner.csv', 'r', encoding='utf-8') as ff:
        reader = csv.DictReader(ff)
        url_list = [row['URL'] for row in reader]

    # Skip stray 'URL' header rows left by the appending CSV writer,
    # consistent with the guard in second_floor.
    tasks = [third_floor(u) for u in url_list if u != 'URL']
    await asyncio.gather(*tasks)


# Temporary debugging entry point.
async def main4():
    """Time a single run of the test() coroutine."""
    t0 = time.perf_counter()
    await asyncio.create_task(test())
    t1 = time.perf_counter()
    logger.debug(f'本次运行耗时{t1 - t0}秒')

if __name__ == '__main__':
    # The selector event loop avoids spurious SSL-shutdown errors (the
    # "443" noise) with aiohttp on Windows — but the policy class only
    # exists on Windows, so guard the platform instead of crashing the
    # script outright on Linux/macOS as the original did.
    if sys.platform == 'win32':
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

    # asyncio.run() replaces the deprecated
    # get_event_loop()/run_until_complete() pair and closes the loop itself.
    asyncio.run(main3())
